# Directory structure
github_dir <- file.path("G:/Shared drives/Nord Lab - Computational Projects/MAR_P2_bulk_RNA-seq")
setwd(github_dir)
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE,
warning = FALSE,
error=FALSE,
echo=TRUE,
cache=TRUE,
fig.width = 7, fig.height = 6,
fig.align = 'left')
# fastq files were transfered to our GCP computer: /mnt/disks/data2/MAR
# The previous batch of this experiment is located at fastq/MAR1. fastq/MAR2 contains the new batch.
fastqc \
--threads 15 \
--outdir /mnt/disks/data2/MAR/fastQC \
/mnt/disks/data2/MAR/fastq/MAR*/*q.gz
# Alignments to Ensembl Rnor_6.0 and UCSC Rn7
# Links:
# http://ftp.ensembl.org/pub/release-104/gtf/rattus_norvegicus/Rattus_norvegicus.Rnor_6.0.104.gtf.gz
# http://ftp.ensembl.org/pub/release-104/fasta/rattus_norvegicus/dna/Rattus_norvegicus.Rnor_6.0.dna.toplevel.fa.gz
# https://hgdownload.soe.ucsc.edu/goldenPath/rn7/bigZips/rn7.fa.gz
# https://hgdownload.soe.ucsc.edu/goldenPath/rn7/bigZips/genes/
# Index generation
STAR \
--runMode genomeGenerate \
--runThreadN 15 \
--genomeDir /mnt/disks/data2/genomes/Rat/rn7_UCSC/STAR_index/ \
--genomeFastaFiles /mnt/disks/data2/genomes/Rat/rn7_UCSC/rn7.fa \
--sjdbGTFfile /mnt/disks/data2/genomes/Rat/rn7_UCSC/refGene.gtf \
--sjdbOverhang 149
STAR \
--runMode genomeGenerate \
--runThreadN 15 \
--genomeDir /mnt/disks/data2/genomes/Rat/rn7_UCSC/STAR_index_ncbi/ \
--genomeFastaFiles /mnt/disks/data2/genomes/Rat/rn7_UCSC/rn7.fa \
--sjdbGTFfile /mnt/disks/data2/genomes/Rat/rn7_UCSC/ncbiRefSeq.gtf \
--sjdbOverhang 149
STAR \
--runMode genomeGenerate \
--runThreadN 15 \
--genomeDir /mnt/disks/data2/genomes/Rat/Rnor_6.0_Ref/STAR_index/ \
--genomeFastaFiles /mnt/disks/data2/genomes/Rat/Rnor_6.0_Ref/Rattus_norvegicus.Rnor_6.0.dna.toplevel.fa \
--sjdbGTFfile /mnt/disks/data2/genomes/Rat/Rnor_6.0_Ref/Rattus_norvegicus.Rnor_6.0.104.gtf \
--sjdbOverhang 149
# Extract unique sample names for MAR1
ls /mnt/disks/data2/MAR/fastq/MAR1/*gz | sed -e 's/\.fastq.gz$//' | sed -e 's/1_001\|2_001$//' | uniq
# Extract unique sample names for MAR2
ls /mnt/disks/data2/MAR/fastq/MAR2/*gz | sed -e 's/\.fq.gz$//' | sed -e 's/_1\|_2$//' | uniq
# Defines an array with sample names,
# From within the MAR1 fastq dir:
samples=($(ls *gz | sed -e 's/\.fastq.gz$//' | sed -e 's/1_001\|2_001$//' | uniq))
# From within the MAR2 fastq dir:
samples=($(ls *gz | sed -e 's/\.fq.gz$//' | sed -e 's/_1\|_2$//' | uniq))
# Execute scripts in for loops:
for i in "${samples[@]}"; do ./alignment_Rnor6.sh $i > ./logs/$i.align_log.txt 2>&1; done
for i in "${samples[@]}"; do ./alignment_Rnor6_2.sh $i > ./logs/$i.align2_log.txt 2>&1; done
for i in "${samples[@]}"; do ./alignment2_rn7_UCSC.sh $i > ./logs/$i.align2_log.txt 2>&1; done
# alignment.sh script is included in this repository
# From within the bam directory
samples=($(ls *bam | sed -e 's/_Aligned.sortedByCoord.out.bam$//'))
echo "${samples[@]}"
for i in "${samples[@]}"; do ./dedup_featureCounts.sh $i > ./logs/$i.dedup_FC_log.txt 2>&1; done
for i in "${samples[@]}"; do ./dedup_featureCounts_UCSC.sh $i > ./logs/$i.dedup_FC_UCSC_log.txt 2>&1; done
setwd("G:/Shared drives/Nord Lab - Computational Projects/MAR_P2_bulk_RNA-seq")
metadata <- read.csv("MAR_metadata_bulk_combined_updated_081921.csv")
### Read UCSC data
Rn7_files <- list.files("./featureCount_UCSC_Rn7/")
fc_list2 <- lapply(Rn7_files, function(x){
read.table(paste0("./featureCount_UCSC_Rn7/", x), header = TRUE)
})
fc_df2 <- Reduce(function(x, y) merge(x, y, by=c(colnames(fc_list2[[1]])[1:6])), fc_list2)
fc2_colnames <- gsub("X.mnt.disks.data2.MAR.bam_sorted_dedup_dir_UCSC.", "", colnames(fc_df2))
fc2_colnames <- gsub(".bam", "", fc2_colnames)
colnames(fc_df2) <- fc2_colnames
# Manually order count df to match metadata
fc_df2 <- fc_df2[,c("Geneid", "Chr", "Start", "End", "Strand", "Length",
"1_S2_L002_R",
"2_S10_L003_R",
"3_S3_L002_R",
"4_S4_L002_R",
"5_S11_L003_R",
"6_S12_L003_R",
"7_S13_L003_R",
"8_S5_L002_R",
"9_S14_L003_R",
"10_S6_L002_R",
"11_S7_L002_R",
"12_S8_L002_R",
"13_S15_L003_R",
"14_S9_L002_R",
"15_S16_L003_R",
"IgG1_1",
"IgG1_2",
"IgG1_3",
"IgG1_4",
"IgG2_1",
"IgG2_2",
"IgG2_3",
"IgG2_4",
"RNA4_1",
"RNA4_2",
"RNA4_3",
"RNA4_4",
"IgG3_1",
"IgG3_2",
"IgG3_3",
"IgG3_4",
"FcRn4_1",
"FcRn4_2",
"FcRn4_3",
"FcRn4_4",
"RNA1_1",
"RNA1_2",
"RNA1_3",
"RNA1_4",
"RNA2_1",
"RNA2_2",
"RNA2_3",
"RNA2_4")]
fc_df_UCSC_Rn7 <- fc_df2
# Removing objects to limit ambiguity in the environment.
rm(fc_df2)
#rm(fc_df)
Reads were aligned to UCSC Rn7 and Ensembl geneme Rnor_6.0 genome. The Ensembl genome produced 20x less mapped reads than alignment using the UCSC genome. There may be something wrong with the reference genome I constructed or the processing pipeline. The previous version of this analysis used Ensembl Rnor genome, which yielded the expected alignment statistics. I think that genome was downloaded from the Illumina iGenomes repository.
For the current analysis, I use the UCSC Rn7 genome only. Batch 2 was generated using polyA-enrichment technique, resulting in lower transcriptomic complexity, that may be more suited for the UCSC genome, which lacks some non-coding and predicted transcript annotations. Results of the Batch 1 DE are very similar in the current and earlier version of this analysis using the Ensembl reference.
library(ggplot2)
library(reshape2)
library(cowplot)
metadata$Mapped_reads_UCSC_Rn7 <- colSums(fc_df_UCSC_Rn7[,c(7:49)])
metadata$RNA_seq_batch <- as.factor(metadata$RNA_seq_batch)
ggplot(metadata, aes(x = RNA_ID, y = Mapped_reads_UCSC_Rn7 / 10^6, color = RNA_seq_batch))+
geom_point()+
theme_cowplot()+
labs(x = "Samples",
y = "Mapped reads [Millions]",
title = "Batch 1 and Batch 2 are sequenced to similar depth")+
theme(plot.title = element_text(size = rel(0.9)))
# Sanity check of the old batch 1 count file
old_batch1 <- read.table("G:/Shared drives/Nord Lab - Computational Projects/MAA-P2/feature_counts.txt",
header = TRUE)
old_batch1_colnames <- gsub("X.share.nordlab.users.kcichewicz.MAA.scripts.STAR_alignment.", "",
colnames(old_batch1))
old_batch1_colnames <- gsub(".bam", "",
old_batch1_colnames)
colnames(old_batch1) <- old_batch1_colnames
#colSums(old_batch1[,c(7:21)]) / 10^6
# Values are within the ballpark of the recent UCSC alignment: 32, 33, 29, 31, 44, 42 Millions of reads. They are ~50% higher than the UCSC alignment due to the presence of predicted genes.
#dplyr::filter(fc_df_UCSC_Rn7, Geneid == "Rn45s")[,c(7:49)]
metadata$Rn45s_reads <- as.numeric(dplyr::filter(fc_df_UCSC_Rn7, Geneid == "Rn45s")[,c(7:49)])
metadata$Fraction_of_rRNA <- metadata$Rn45s_reads / metadata$Mapped_reads_UCSC_Rn7
metadata$RNA_seq_batch <- as.factor(metadata$RNA_seq_batch)
ggplot(metadata, aes(x = RNA_ID, y = Fraction_of_rRNA,
color = RNA_seq_batch))+
geom_point()+
theme_cowplot()+
labs(x = "Samples",
y = "Fraction of rRNA reads \n inferred from Rn45s counts",
title = "Batch 1 has more rRNA reads because it was processed using rRNA depletion \n instead of polyA enrichment. The overall amount of rRNA reads is not concerning")+
theme(plot.title = element_text(size = rel(0.9)))
# In Rn7 Xist is annotated as LOC100911498
# https://genome.ucsc.edu/cgi-bin/hgc?hgsid=1231341959_KptCXTNCacrAQmfv5IGbxvCeUzFY&db=rn7&c=chrX&l=68474474&r=68497336&o=68474986&t=68492500&g=ncbiRefSeqCurated&i=NR_132635.1
metadata$Xist_reads <- as.numeric(dplyr::filter(fc_df_UCSC_Rn7, Geneid == "LOC100911498")[,c(7:49)])
metadata$Fraction_of_Xist <- metadata$Xist_reads / metadata$Mapped_reads_UCSC_Rn7
ggplot(metadata, aes(x = RNA_ID, y = Fraction_of_Xist,
color = Sex,
shape = RNA_seq_batch))+
geom_point()+
theme_cowplot()+
labs(x = "Samples",
y = "Fraction of Xist reads",
title = "Xist expression confirms correct sex annotation in the metadata")+
theme(plot.title = element_text(size = rel(0.9)))
library(edgeR)
counts <- fc_df_UCSC_Rn7
min.cpm.criteria = 1 # a reasonable rule-of-thumb threshold
test.data <- counts[, 7:49]
rownames(test.data) <- counts$Geneid
test.samples <- 1:nrow(metadata)
min.cpm <- min.cpm.criteria
y <- DGEList(counts=test.data, group=metadata$genotype)
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- calcNormFactors(y) # Normalizes for RNA composition
y <- estimateCommonDisp(y) # Estimates common dispersions. Calculates pseudo-counts, a type of normalized counts. Don't misteke them with 0+x type counts from other packages. Also "users are advised not to interpret the psuedo-counts as general-purpose normalized counts".
y <- estimateTagwiseDisp(y) #Estimates dispersions. Applicable only to experiments with single factor.
#Alternatively the two commands from above can be replaced with y <- estimateDisp(y)
#metadata <- arrange(metadata, counts_colnames)
#MDS plot using ggplot.
MDS_data1 <- plotMDS(y, plot = FALSE, dim.plot = c(1, 2))
MDS_data2 <- plotMDS(y, plot = FALSE, dim.plot = c(3, 4))
MDS_data3 <- plotMDS(y, plot = FALSE, dim.plot = c(5, 6))
MDS_data4 <- plotMDS(y, plot = FALSE, dim.plot = c(7, 8))
MDS_data5 <- plotMDS(y, plot = FALSE, dim.plot = c(9, 10))
MDS_data2 <- data.frame(Leading_logFC_dim_1 = MDS_data1$x,
Leading_logFC_dim_2 = MDS_data1$y,
Leading_logFC_dim_3 = MDS_data2$x,
Leading_logFC_dim_4 = MDS_data2$y,
Leading_logFC_dim_5 = MDS_data3$x,
Leading_logFC_dim_6 = MDS_data3$y,
Leading_logFC_dim_7 = MDS_data4$x,
Leading_logFC_dim_8 = MDS_data4$y,
Leading_logFC_dim_9 = MDS_data5$x,
Leading_logFC_dim_10 = MDS_data5$y)
MDS_data2 <- data.frame(Samples=rownames(MDS_data1$distance.matrix.squared), MDS_data2, metadata)
#Sanity check
#all(MDS_data2$Samples == MDS_data2$counts_colnames)
rownames(MDS_data2) <- NULL
MDS_data2$Animal <- as.factor(MDS_data2$Animal)
# MDS Plot with colored DPC
#Condition and sex
point_size = 3
MDS_plot <- function(variable, dim_x, dim_y){
ggplot(MDS_data2, aes(x=get(paste0("Leading_logFC_dim_", dim_x)),
y=get(paste0("Leading_logFC_dim_", dim_y)),
colour=get(variable),
shape=RNA_seq_batch,
label = Samples))+
geom_point(size=point_size, alpha=0.5)+
theme_bw()+
labs(title = "MDS plot", color = variable,
x = paste0("Dim_", dim_x),
y = paste0("Dim_", dim_y))+
theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
theme(axis.text.x = element_text(size=12), text = element_text(size=12))+
theme(legend.position = "bottom")
}
plot_grid(MDS_plot("Condition", 1, 2),
MDS_plot("Condition", 3, 4),
MDS_plot("Condition", 5, 6),
MDS_plot("Condition", 7, 8),
nrow = 2)
Multidimensional scaling separates RNA-seq batches along the 1st dimension MDS does not suggest presence of outlier samples
plot_grid(MDS_plot("Sex", 1, 2),
MDS_plot("Sex", 3, 4),
MDS_plot("Sex", 5, 6),
MDS_plot("Sex", 7, 8),
nrow = 2)
Multidimensional scaling separates RNA-seq samples by sex along the 3rd and 4th dimension MDS does not suggest presence of outlier samples
# Centering the PCA doesn't have any major effect on clustering. It just flips Sample 1 and Sample 2 alongPC2.
# Scaling emphasizes batch effects along PC1, but does not separate batches completely. It makes sense because it reduces the influence of sequencing depth(color seq depth on PCs?).
# PC2 resolves batch effects regardless of centering and scaling.
# PC1 accounts for ~97 percent of the variance regardless of centering and scaling.
pca_data <- prcomp(counts[, 7:49], center = TRUE, scale = TRUE)
pca_data_var_expl <- pca_data$sdev^2 / sum(pca_data$sdev^2)
scree <- data.frame("PC" = paste0(seq(1:length(pca_data_var_expl))), "Var_exp" = pca_data_var_expl)
pca_data <- as.data.frame(pca_data$rotation)
pca_data$Sample <- colnames(pca_data)
pca_data <- data.frame(metadata, pca_data)
# save.image("G:/Shared drives/Nord Lab - Computational Projects/MAR_P2_integrated_batches_2021/session_12_08_21.RData")
scree$PC <- as.numeric(scree$PC)
ggplot(scree[1:5,], aes(x = PC, y = Var_exp))+
geom_point()+
geom_line()+
scale_y_continuous(breaks=seq(0, 1, 0.1))+
theme_bw()+
labs(title = "Scree plot", x = "PC", y = "Variance explained")+
theme(plot.title = element_text(hjust = 0.5))
PC1 accounts for 97.4% of variation in the data
PCA_plot <- function(dim_x, dim_y, title){
ggplot(pca_data, aes(x = get(dim_x), y = get(dim_y), color = RNA_seq_batch, shape = Sex))+
geom_point(alpha = 0.7, size = 5)+
theme_bw()+
labs(title = title, x = dim_x, y = dim_y)+
theme(plot.title = element_text(hjust = 0.5))
}
plot_grid(PCA_plot("PC1", "PC2", "PC1 vs PC2"),
PCA_plot("PC3", "PC4", "PC3 vs PC4"),
nrow = 2)
# Top genes driving PC1 and PC2 = PC loadings
pca_data <- prcomp(counts[, 7:49], center = TRUE, scale = TRUE)
pca_genes <- pca_data$x
rownames(pca_genes) <- counts$Geneid
PC1_genes <- data.frame(sort(abs(pca_genes[,"PC1"]), decreasing=TRUE)[1:200])
PC2_genes <- data.frame(sort(abs(pca_genes[,"PC2"]), decreasing=TRUE)[1:200])
names(PC1_genes) <- "PC1_loadings"
names(PC2_genes) <- "PC2_loadings"
knitr::kable(head(PC1_genes, 10))
| PC1_loadings | |
|---|---|
| Map1b | 242.8209 |
| Tuba1a | 175.3723 |
| Tubb2b | 158.5949 |
| Tubb5 | 143.3721 |
| Actb | 125.3359 |
| Mapt | 120.7521 |
| Tubb3 | 118.7766 |
| Dpysl2 | 117.6160 |
| Eef1a1 | 114.5683 |
| Dcx | 106.9519 |
# Rn45s is the top gene in PC2
knitr::kable(head(PC2_genes, 10))
| PC2_loadings | |
|---|---|
| Rn45s | 46.26921 |
| Map1b | 44.11745 |
| LOC310926 | 28.00099 |
| Actb | 16.13620 |
| Dpysl2 | 14.59126 |
| Actg1 | 14.24745 |
| Ncan | 13.40974 |
| Rpph1 | 13.18969 |
| Tubb2b | 11.29899 |
| Macf1 | 11.27942 |
This is looking great. Replication-dependent histones are not polyadenylated. They are present in the rRNA-depletd samples (Batch 1) but not in the polyA-enriched samples (Batch 2).
library(DT)
# Run DE comparing the two batches
#metadata
counts <- fc_df_UCSC_Rn7
metadata$Count_colnames <- colnames(counts)[7:49]
source("DE_function_comparing_batches.R")
DE_of_batches <- DE_function_comparing_batches()
datatable(DE_of_batches, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
knitr::kable(melt(dplyr::filter(counts, Geneid == "Hist1h4m")))
| Geneid | Chr | Start | End | Strand | variable | value |
|---|---|---|---|---|---|---|
| Hist1h4m | chr17 | 42698054 | 42698365 | + | Length | 312 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 1_S2_L002_R | 2533 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 2_S10_L003_R | 2137 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 3_S3_L002_R | 2098 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 4_S4_L002_R | 2318 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 5_S11_L003_R | 2530 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 6_S12_L003_R | 2017 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 7_S13_L003_R | 1486 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 8_S5_L002_R | 1898 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 9_S14_L003_R | 2426 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 10_S6_L002_R | 2231 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 11_S7_L002_R | 2227 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 12_S8_L002_R | 1622 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 13_S15_L003_R | 2693 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 14_S9_L002_R | 1616 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | 15_S16_L003_R | 1523 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG1_1 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG1_2 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG1_3 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG1_4 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG2_1 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG2_2 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG2_3 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG2_4 | 4 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA4_1 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA4_2 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA4_3 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA4_4 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG3_1 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG3_2 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG3_3 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | IgG3_4 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | FcRn4_1 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | FcRn4_2 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | FcRn4_3 | 2 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | FcRn4_4 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA1_1 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA1_2 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA1_3 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA1_4 | 4 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA2_1 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA2_2 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA2_3 | 0 |
| Hist1h4m | chr17 | 42698054 | 42698365 | + | RNA2_4 | 0 |
# GO enrichment in 102 DE genes between batch 1 and batch 2.
# dplyr::filter(DE_of_batches, FDR < 1e-80) # 102 genes
#source("GO_analysis.R")
#Batches_GO <- GO_analysis(DE_of_batches, 8744)
# Top GO categories:
# nucleosome assembly
# DNA replication-dependent nucleosome ass...
# chromatin silencing
# innate immune response in mucosa
# negative regulation of megakaryocyte dif...
# nucleosome positioning
### GO BP for 100 PC1 and PC2 loading genes
source("GO_analysis_input_genes.R")
PC1_GO <- GO_analysis_input_genes(DE_of_batches, rownames(PC1_genes)[1:100], 8744)
##
## ------------------------- topGOdata object -------------------------
##
## Description:
## - My project
##
## Ontology:
## - BP
##
## 12610 available genes (all genes from the array):
## - symbol: Hist1h4m Rn7sl1 H1f1 Hist2h4 Hist1h2bg ...
## - 100 significant genes.
##
## 11357 feasible genes (genes that can be used in the analysis):
## - symbol: Hist1h4m H1f1 Hist2h4 Hist1h2bg H1f5 ...
## - 100 significant genes.
##
## GO graph (nodes with at least 5 genes):
## - a graph with directed edges
## - number of nodes = 8744
## - number of edges = 19590
##
## ------------------------- topGOdata object -------------------------
PC2_GO <- GO_analysis_input_genes(DE_of_batches, rownames(PC2_genes)[1:100], 8744)
##
## ------------------------- topGOdata object -------------------------
##
## Description:
## - My project
##
## Ontology:
## - BP
##
## 12610 available genes (all genes from the array):
## - symbol: Hist1h4m Rn7sl1 H1f1 Hist2h4 Hist1h2bg ...
## - 100 significant genes.
##
## 11357 feasible genes (genes that can be used in the analysis):
## - symbol: Hist1h4m H1f1 Hist2h4 Hist1h2bg H1f5 ...
## - 97 significant genes.
##
## GO graph (nodes with at least 5 genes):
## - a graph with directed edges
## - number of nodes = 8744
## - number of edges = 19590
##
## ------------------------- topGOdata object -------------------------
knitr::kable(head(PC1_GO[[1]], 20))
| GO.ID | Term | Annotated | Significant | Expected | classicFisher |
|---|---|---|---|---|---|
| GO:0047497 | mitochondrion transport along microtubul… | 20 | 5 | 0.18 | 1.9e-07 |
| GO:0045773 | positive regulation of axon extension | 50 | 7 | 0.44 | 2.4e-07 |
| GO:0001764 | neuron migration | 156 | 11 | 1.37 | 3.7e-07 |
| GO:0000226 | microtubule cytoskeleton organization | 480 | 21 | 4.23 | 1.0e-06 |
| GO:0021762 | substantia nigra development | 41 | 6 | 0.36 | 1.4e-06 |
| GO:0007409 | axonogenesis | 415 | 28 | 3.65 | 3.9e-06 |
| GO:0000278 | mitotic cell cycle | 714 | 19 | 6.29 | 6.3e-06 |
| GO:0007026 | negative regulation of microtubule depol… | 31 | 5 | 0.27 | 6.8e-06 |
| GO:0032232 | negative regulation of actin filament bu… | 27 | 4 | 0.24 | 1.3e-05 |
| GO:0098974 | postsynaptic actin cytoskeleton organiza… | 20 | 4 | 0.18 | 2.5e-05 |
| GO:0048675 | axon extension | 129 | 15 | 1.14 | 2.9e-05 |
| GO:0042493 | response to drug | 492 | 15 | 4.33 | 5.9e-05 |
| GO:0071353 | cellular response to interleukin-4 | 25 | 4 | 0.22 | 6.2e-05 |
| GO:1990090 | cellular response to nerve growth factor… | 70 | 7 | 0.62 | 6.8e-05 |
| GO:0051693 | actin filament capping | 31 | 4 | 0.27 | 0.00010 |
| GO:0072321 | chaperone-mediated protein transport | 11 | 3 | 0.10 | 0.00010 |
| GO:0061635 | regulation of protein complex stability | 12 | 3 | 0.11 | 0.00014 |
| GO:0014049 | positive regulation of glutamate secreti… | 13 | 3 | 0.11 | 0.00018 |
| GO:0031915 | positive regulation of synaptic plastici… | 13 | 3 | 0.11 | 0.00018 |
| GO:0061684 | chaperone-mediated autophagy | 13 | 3 | 0.11 | 0.00018 |
knitr::kable(head(PC2_GO[[1]], 20))
| GO.ID | Term | Annotated | Significant | Expected | classicFisher |
|---|---|---|---|---|---|
| GO:0001764 | neuron migration | 156 | 10 | 1.33 | 2.2e-07 |
| GO:0021762 | substantia nigra development | 41 | 6 | 0.35 | 1.2e-06 |
| GO:0007026 | negative regulation of microtubule depol… | 31 | 5 | 0.26 | 5.8e-06 |
| GO:0042493 | response to drug | 492 | 14 | 4.20 | 4.7e-05 |
| GO:0050772 | positive regulation of axonogenesis | 90 | 8 | 0.77 | 8.7e-05 |
| GO:0072321 | chaperone-mediated protein transport | 11 | 3 | 0.09 | 9.5e-05 |
| GO:0021766 | hippocampus development | 109 | 7 | 0.93 | 0.00012 |
| GO:0061635 | regulation of protein complex stability | 12 | 3 | 0.10 | 0.00013 |
| GO:1990090 | cellular response to nerve growth factor… | 70 | 6 | 0.60 | 0.00015 |
| GO:0061684 | chaperone-mediated autophagy | 13 | 3 | 0.11 | 0.00016 |
| GO:0099566 | regulation of postsynaptic cytosolic cal… | 14 | 3 | 0.12 | 0.00021 |
| GO:0000281 | mitotic cytokinesis | 64 | 5 | 0.55 | 0.00021 |
| GO:0007411 | axon guidance | 199 | 14 | 1.70 | 0.00022 |
| GO:0010942 | positive regulation of cell death | 574 | 7 | 4.90 | 0.00028 |
| GO:0031115 | negative regulation of microtubule polym… | 16 | 3 | 0.14 | 0.00031 |
| GO:0048675 | axon extension | 129 | 11 | 1.10 | 0.00036 |
| GO:0042220 | response to cocaine | 74 | 5 | 0.63 | 0.00041 |
| GO:0032516 | positive regulation of phosphoprotein ph… | 19 | 3 | 0.16 | 0.00053 |
| GO:0047497 | mitochondrion transport along microtubul… | 20 | 3 | 0.17 | 0.00062 |
| GO:0007409 | axonogenesis | 415 | 27 | 3.54 | 0.00069 |
Calculate RPKM values
# Calculate RPKM
### Generate Rn6 list of exon sizes
# Method description at: https://www.biostars.org/p/83901/
library(GenomicFeatures)
txdb <- makeTxDbFromGFF("refGene.gtf", format="gtf") #The file is too big to be uploaded to GitHub.
exons.list.per.gene <- exonsBy(txdb, by="gene")
# Parallelized, increasing the speed >2x on a 4-core (logical) machine.
# Use mclapply instead of parLapply if you use a Mac.
cl <- makeCluster(detectCores())
exonic.gene.sizes <- parallel::parLapply(cl, exons.list.per.gene,function(x){sum(width(reduce(x)))})
stopCluster(cl)
# Calculate RPKM
exp.data <- counts[,7:49]
rownames(exp.data) <- counts$Geneid
# Sanity check = PASSED
#all(names(exonic.gene.sizes) == rownames(exp.data))
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
rpkm.data_log <- as.data.frame(rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25))
rpkm.data_linear <- as.data.frame(rpkm(exp.data, gene.length=gene.lengths, log=F))
#rownames(counts) <- counts$Geneid
#counts <- counts[,7:49]
#counts <- as.matrix(counts)
# CPM threshold of 1
# Sex is a covariate
# Non-batch corrected
source("DE_function.R")
DE_b1 <- DE_function(1)
DE_b2 <- DE_function(2)
DE_b1_and_2 <- DE_function(c(1,2))
# Batch corrected
# With CPM threshold of 1
# Sex is a covariate
source("DE_function_batch.R")
DE_b1_and_2_batch_cor <- DE_function_batch(c(1,2))
### DE analysis without sex covariate
source("DE_function_no_sex.R")
DE_b1_no_sex <- DE_function_no_sex(1)
DE_b2_no_sex <- DE_function_no_sex(2)
DE_b1_and_2_no_sex <- DE_function_no_sex(c(1,2))
source("DE_function_sex_specific.R")
DE_b1_Male <- DE_function_sex_specific(1, "M")
DE_b2_Male <- DE_function_sex_specific(2, "M")
DE_b1_and_2_Male <- DE_function_sex_specific(c(1,2), "M")
DE_b1_Female <- DE_function_sex_specific(1, "F")
DE_b2_Female <- DE_function_sex_specific(2, "F")
DE_b1_and_2_Female <- DE_function_sex_specific(c(1,2), "F")
### DE expression with PCAs as covariates ###
# I'm including both replicates because PCA was run on the complete dataset
#source("DE_function_w_PCA.R")
#DE_b1_and_2_test <- DE_function_w_PCA(c(1,2))
#head(DE_b1_and_2_test$FDR)
#sum(DE_b1_and_2_test$FDR < 0.1)
# N of genes passing FDR < 0.1
# No PCs : 1
# PC1 : 0
# PC1 + batch: 0
# PC1 and 2 : 0
# PC1 to 5 : 0
# PC1 to 10 : 8
# Fraction of rRNA: 0
# Sex : 1
# Sex + Batch: 0
# Conclusion: Adding PCs does not improve the DE model
# DE sets dims
#dim(DE_b1) # 12360
#dim(DE_b2) # 12413
#dim(DE_b1_and_2) # 12610
#dim(DE_b1_and_2_batch_cor) # 12610
source("volcano_plot.R")
plot_grid(
volcano_plot(DE_b1, "Batch_1"),
volcano_plot(DE_b2, "Batch_2"),
volcano_plot(DE_b1_and_2, "Batch_1_and_2_combined"),
volcano_plot(DE_b1_and_2_batch_cor, "Batch_1_and_2_batch_corrected"),
nrow = 2
)
Y axis us autoscaled
library(ggrepel)
source("volcano_plot_Y_capped.R")
#volcano_plot_label_Y_capped(DE_b1, "Batch_1")
plot_grid(
volcano_plot_label_Y_capped(DE_b1, "Batch_1"),
volcano_plot_label_Y_capped(DE_b2, "Batch_2")
)
DE models include a sex covariate
datatable(dplyr::filter(DE_b1, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
datatable(dplyr::filter(DE_b2, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
plot_grid(
volcano_plot_label_Y_capped(DE_b1_and_2, "Batch_1_and_2_combined"),
volcano_plot_label_Y_capped(DE_b1_and_2_batch_cor, "Batch_1_and_2_batch_corrected")
)
DE models include a sex covariate. The batch_corrected model includes a batch covariate.
datatable(dplyr::filter(DE_b1_and_2, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
datatable(dplyr::filter(DE_b1_and_2_batch_cor, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
### No sex covariate = model not corrected for sex effects ###
plot_grid(
volcano_plot_label_Y_capped(DE_b1_no_sex, "Batch_1"),
volcano_plot_label_Y_capped(DE_b2_no_sex, "Batch_2")
)
DE models do not include sex covariate.
datatable(dplyr::filter(DE_b1_no_sex, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
datatable(dplyr::filter(DE_b2_no_sex, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
#volcano_plot_label_Y_capped(DE_b1_and_2_no_sex, "Batch_1_and_2_combined")
### Sex-specific DE ###
plot_grid(
volcano_plot_label_Y_capped(DE_b1_Male, "Batch 1 Male"),
volcano_plot_label_Y_capped(DE_b2_Male, "Batch 2 Male")
)
datatable(dplyr::filter(DE_b1_Male, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
datatable(dplyr::filter(DE_b2_Male, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
plot_grid(
volcano_plot_label_Y_capped(DE_b1_Female, "Batch 1 Female"),
volcano_plot_label_Y_capped(DE_b2_Female, "Batch 2 Female")
)
datatable(dplyr::filter(DE_b1_Female, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
datatable(dplyr::filter(DE_b2_Female, PValue < 0.05), rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
## Merge and compare DE signatures in the two batches
colnames(DE_b1) <- c("gene_id", "logFC_b1", "logCPM_b1", "LR_b1", "PValue_b1", "FDR_b1")
colnames(DE_b2) <- c("gene_id", "logFC_b2", "logCPM_b2", "LR_b2", "PValue_b2", "FDR_b2")
DE_1_and_2_merged <- merge(DE_b1, DE_b2, by = "gene_id") # 12191 genes
x <- DE_1_and_2_merged
test <- ifelse(x$PValue_b1, "Non significant")
test <- ifelse(x$PValue_b1 < 0.05, "PValue < 0.05 in B1", test)
test <- ifelse(x$PValue_b2 < 0.05, "PValue < 0.05 in B2", test)
test <- ifelse(x$PValue_b1 < 0.05 & x$PValue_b2 < 0.05, "PValue < 0.05 in B1 and B2", test)
DE_1_and_2_merged$DE_significance <- test
DE_1_and_2_merged_sig <- dplyr::filter(DE_1_and_2_merged, PValue_b1 < 0.05 | PValue_b2 < 0.05) # 1345 genes
# Cap at -1 and +1 log2FC
df <- DE_1_and_2_merged_sig
df$logFC_b1 <- ifelse(df$logFC_b1 < -1, -1, df$logFC_b1)
df$logFC_b1 <- ifelse(df$logFC_b1 > 1, 1, df$logFC_b1)
df$logFC_b2 <- ifelse(df$logFC_b2 < -1, -1, df$logFC_b2)
df$logFC_b2 <- ifelse(df$logFC_b2 > 1, 1, df$logFC_b2)
df$DE_significance <- as.factor(df$DE_significance)
library(ggrepel)
ggplot()+
geom_hline(yintercept = 0, linetype = 1)+
geom_vline(xintercept = 0, linetype = 1)+
geom_point(data = dplyr::filter(df, DE_significance != "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.3)+
geom_point(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.8)+
theme_bw()+
#geom_text_repel(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
# aes(x = logFC_b1, y = logFC_b2, label = gene_id), color = "#2f6e31")+
labs(title = "Batch1 vs Batch2 log2 fold change plot of significantly DE genes \n indicates very little concordance between replicates. \n Plot displays 1345 genes passing P < 0.05 in either of the two batches.",
x = "Batch 1 [log2 fold change]",
y = "Batch 2 [log2 fold change]")+
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(labels = c("Batch 1, P < 0.05",
"Batch 1 & 2, P < 0.05",
"Batch 2, P < 0.05"),
values = c("#F8766D",
"#7CAE00",
"#00BFC4"))
### Only genes with P<0.05 in both batches
ggplot()+
geom_hline(yintercept = 0, linetype = 1)+
geom_vline(xintercept = 0, linetype = 1)+
#geom_point(data = dplyr::filter(df, DE_significance != "PValue < 0.05 in B1 and B2"),
# aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.3)+
#geom_point(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
# aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.8)+
theme_bw()+
geom_text_repel(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, label = gene_id), color = "#2f6e31")+
labs(title = "Batch1 vs Batch2 log2 fold change plot of significantly DE genes \n indicates very little concordance between replicates. \n Plot displays genes passing P < 0.05 in both batches.",
x = "Batch 1 [log2 fold change]",
y = "Batch 2 [log2 fold change]")+
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(labels = c("Batch 1, P < 0.05",
"Batch 1 & 2, P < 0.05",
"Batch 2, P < 0.05"),
values = c("#F8766D",
"#7CAE00",
"#00BFC4"))
## Merge and compare DE signatures in the two batches
colnames(DE_b1_Male) <- c("gene_id", "logFC_b1", "logCPM_b1", "LR_b1", "PValue_b1", "FDR_b1")
colnames(DE_b2_Male) <- c("gene_id", "logFC_b2", "logCPM_b2", "LR_b2", "PValue_b2", "FDR_b2")
DE_1_and_2_merged <- merge(DE_b1_Male, DE_b2_Male, by = "gene_id") # 12005 genes
x <- DE_1_and_2_merged
test <- ifelse(x$PValue_b1, "Non significant")
test <- ifelse(x$PValue_b1 < 0.05, "PValue < 0.05 in B1", test)
test <- ifelse(x$PValue_b2 < 0.05, "PValue < 0.05 in B2", test)
test <- ifelse(x$PValue_b1 < 0.05 & x$PValue_b2 < 0.05, "PValue < 0.05 in B1 and B2", test)
DE_1_and_2_merged$DE_significance <- test
DE_1_and_2_merged_sig <- dplyr::filter(DE_1_and_2_merged, PValue_b1 < 0.05 | PValue_b2 < 0.05) # 1345 genes
# Cap at -1 and +1 log2FC
df <- DE_1_and_2_merged_sig
df$logFC_b1 <- ifelse(df$logFC_b1 < -1, -1, df$logFC_b1)
df$logFC_b1 <- ifelse(df$logFC_b1 > 1, 1, df$logFC_b1)
df$logFC_b2 <- ifelse(df$logFC_b2 < -1, -1, df$logFC_b2)
df$logFC_b2 <- ifelse(df$logFC_b2 > 1, 1, df$logFC_b2)
df$DE_significance <- as.factor(df$DE_significance)
library(ggrepel)
ggplot()+
geom_hline(yintercept = 0, linetype = 1)+
geom_vline(xintercept = 0, linetype = 1)+
geom_point(data = dplyr::filter(df, DE_significance != "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.3)+
geom_point(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.8)+
theme_bw()+
#geom_text_repel(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
# aes(x = logFC_b1, y = logFC_b2, label = gene_id), color = "#2f6e31")+
labs(title = "Batch1 vs Batch2 log2 fold change plot of significantly DE genes \n in Males",
x = "Batch 1 [log2 fold change]",
y = "Batch 2 [log2 fold change]")+
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(labels = c("Batch 1, P < 0.05",
"Batch 1 & 2, P < 0.05",
"Batch 2, P < 0.05"),
values = c("#F8766D",
"#7CAE00",
"#00BFC4"))
## Merge and compare DE signatures in the two batches
colnames(DE_b1_Female) <- c("gene_id", "logFC_b1", "logCPM_b1", "LR_b1", "PValue_b1", "FDR_b1")
colnames(DE_b2_Female) <- c("gene_id", "logFC_b2", "logCPM_b2", "LR_b2", "PValue_b2", "FDR_b2")
DE_1_and_2_merged <- merge(DE_b1_Female, DE_b2_Female, by = "gene_id") # 12005 genes
x <- DE_1_and_2_merged
test <- ifelse(x$PValue_b1, "Non significant")
test <- ifelse(x$PValue_b1 < 0.05, "PValue < 0.05 in B1", test)
test <- ifelse(x$PValue_b2 < 0.05, "PValue < 0.05 in B2", test)
test <- ifelse(x$PValue_b1 < 0.05 & x$PValue_b2 < 0.05, "PValue < 0.05 in B1 and B2", test)
DE_1_and_2_merged$DE_significance <- test
DE_1_and_2_merged_sig <- dplyr::filter(DE_1_and_2_merged, PValue_b1 < 0.05 | PValue_b2 < 0.05) # 1345 genes
# Cap at -1 and +1 log2FC
df <- DE_1_and_2_merged_sig
df$logFC_b1 <- ifelse(df$logFC_b1 < -1, -1, df$logFC_b1)
df$logFC_b1 <- ifelse(df$logFC_b1 > 1, 1, df$logFC_b1)
df$logFC_b2 <- ifelse(df$logFC_b2 < -1, -1, df$logFC_b2)
df$logFC_b2 <- ifelse(df$logFC_b2 > 1, 1, df$logFC_b2)
df$DE_significance <- as.factor(df$DE_significance)
library(ggrepel)
ggplot()+
geom_hline(yintercept = 0, linetype = 1)+
geom_vline(xintercept = 0, linetype = 1)+
geom_point(data = dplyr::filter(df, DE_significance != "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.3)+
geom_point(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
aes(x = logFC_b1, y = logFC_b2, color = DE_significance), alpha = 0.8)+
theme_bw()+
#geom_text_repel(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
# aes(x = logFC_b1, y = logFC_b2, label = gene_id), color = "#2f6e31")+
labs(title = "Batch1 vs Batch2 log2 fold change plot of significantly DE genes \n in Females",
x = "Batch 1 [log2 fold change]",
y = "Batch 2 [log2 fold change]")+
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(labels = c("Batch 1, P < 0.05",
"Batch 1 & 2, P < 0.05",
"Batch 2, P < 0.05"),
values = c("#F8766D",
"#7CAE00",
"#00BFC4"))
df$b1_Prank <- rank(df$PValue_b1)
df$b2_Prank <- rank(df$PValue_b2)
ggplot(df, aes(x = b1_Prank, y = b2_Prank))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_text_repel(data = dplyr::filter(df, DE_significance == "PValue < 0.05 in B1 and B2"),
aes(x = b1_Prank, y = b2_Prank, label = gene_id), color = "#2f6e31")+
labs(title = "Batch 1 vs Batch 2 ranks of P values",
x = "Batch 1 P value",
y = "Batch 2 P value")
library(dplyr)
numbers_of_DE <- function(DE_b1, DE_b2, label_1, label_2){
B1_P_Up <- length(filter(DE_1_and_2_merged, logFC_b1 > 0, PValue_b1 < 0.05)$gene_id)
B1_P_Down <- length(filter(DE_1_and_2_merged, logFC_b1 < 0, PValue_b1 < 0.05)$gene_id)
B2_P_Up <- length(filter(DE_1_and_2_merged, logFC_b2 > 0, PValue_b2 < 0.05)$gene_id)
B2_P_Down <- length(filter(DE_1_and_2_merged, logFC_b2 < 0, PValue_b2 < 0.05)$gene_id)
B1_FDR_Up <- length(filter(DE_1_and_2_merged, logFC_b1 > 0, FDR_b1 < 0.1)$gene_id)
B1_FDR_Down <- length(filter(DE_1_and_2_merged, logFC_b1 < 0, FDR_b1 < 0.1)$gene_id)
B2_FDR_Up <- length(filter(DE_1_and_2_merged, logFC_b2 > 0, FDR_b2 < 0.1)$gene_id)
B2_FDR_Down <- length(filter(DE_1_and_2_merged, logFC_b2 < 0, FDR_b2 < 0.1)$gene_id)
DE_df_n <- t(data.frame(
"Batch_1" = c(B1_P_Up, B1_P_Down, B1_FDR_Up, B1_FDR_Down),
"Batch_2" = c(B2_P_Up, B2_P_Down, B2_FDR_Up, B2_FDR_Down)))
colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")
DE_df_n_melted <- reshape::melt(DE_df_n)
DE_df_n_melted$stat <- c(rep(", P < 0.05", 4), rep(", FDR < 0.1", 4))
DE_df_n_melted$X2_stat <- paste(DE_df_n_melted$X2, DE_df_n_melted$stat)
#DE_df_n_melted
ggplot(DE_df_n_melted, aes(fill=X2_stat, group=X2, x=X1, y=value))+
geom_bar(position = "dodge", stat="identity", color="black")+
theme_bw()+
scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
theme(legend.title=element_blank())+
geom_text(aes(label=value), position=position_dodge(width=0.9), vjust = -0.7, hjust=0.5)+
#coord_flip()+
scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
theme_cowplot()+
theme(panel.border = element_blank(),
legend.title = element_blank(),
axis.ticks = element_blank(),
#axis.text = element_blank(),
#axis.text.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position="none")+
labs(title="Number of DE genes with P < 0.05 and FDR < 0.1", x="", y="")+
theme(plot.title = element_text(size = 12, hjust=0.5, face = "plain"))
}
numbers_of_DE(DE_b1, DE_b2, "Batch_1", "Batch_2")
DE models include a sex covariate
# save.image("G:/Shared drives/Nord Lab - Computational Projects/MAR_P2_integrated_batches_2021/session_12_09_21.RData")
#setwd("G:/Shared drives/Nord Lab - Computational Projects/MAR_P2_bulk_RNA-seq")
#write.csv(DE_b1, file = "Batch_1_with_sex_covariate_DE_results.csv")
#write.csv(DE_b2, file = "Batch_2_with_sex_covariate_DE_results.csv")
#write.csv(DE_b1_and_2_batch_cor, file = "Batch_1_with_sex_and_batch_covariates_DE_results.csv")
#write.csv(DE_b1_Male, file = "Batch_1_Male_DE_results.csv")
#write.csv(DE_b1_Female, file = "Batch_1_Female_DE_results.csv")
#write.csv(DE_b2_Male, file = "Batch_2_Male_DE_results.csv")
#write.csv(DE_b2_Female, file = "Batch_2_Female_DE_results.csv")
library(pander)
pander(sessionInfo())
R version 4.1.1 (2021-08-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale: LC_COLLATE=English_United States.1252, LC_CTYPE=English_United States.1252, LC_MONETARY=English_United States.1252, LC_NUMERIC=C and LC_TIME=English_United States.1252
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: pander(v.0.6.4), dplyr(v.1.0.7), ggrepel(v.0.9.1), GenomicFeatures(v.1.44.2), GenomicRanges(v.1.44.0), GenomeInfoDb(v.1.28.4), org.Rn.eg.db(v.3.13.0), topGO(v.2.44.0), SparseM(v.1.81), GO.db(v.3.13.0), AnnotationDbi(v.1.54.1), IRanges(v.2.26.0), S4Vectors(v.0.30.2), Biobase(v.2.52.0), graph(v.1.70.0), BiocGenerics(v.0.38.0), DT(v.0.19), edgeR(v.3.34.1), limma(v.3.48.3), cowplot(v.1.1.1), reshape2(v.1.4.4) and ggplot2(v.3.3.5)
loaded via a namespace (and not attached): bitops(v.1.0-7), matrixStats(v.0.61.0), bit64(v.4.0.5), filelock(v.1.0.2), progress(v.1.2.2), httr(v.1.4.2), tools(v.4.1.1), bslib(v.0.3.1), utf8(v.1.2.2), R6(v.2.5.1), DBI(v.1.1.1), colorspace(v.2.0-2), withr(v.2.4.2), tidyselect(v.1.1.1), prettyunits(v.1.1.1), bit(v.4.0.4), curl(v.4.3.2), compiler(v.4.1.1), xml2(v.1.3.2), DelayedArray(v.0.18.0), labeling(v.0.4.2), rtracklayer(v.1.52.1), sass(v.0.4.0), scales(v.1.1.1), rappdirs(v.0.3.3), Rsamtools(v.2.8.0), stringr(v.1.4.0), digest(v.0.6.27), rmarkdown(v.2.11), XVector(v.0.32.0), pkgconfig(v.2.0.3), htmltools(v.0.5.2), MatrixGenerics(v.1.4.3), highr(v.0.9), dbplyr(v.2.1.1), fastmap(v.1.1.0), htmlwidgets(v.1.5.4), rlang(v.0.4.11), RSQLite(v.2.2.8), farver(v.2.1.0), jquerylib(v.0.1.4), BiocIO(v.1.2.0), generics(v.0.1.1), jsonlite(v.1.7.2), crosstalk(v.1.1.1), BiocParallel(v.1.26.2), RCurl(v.1.98-1.5), magrittr(v.2.0.1), GenomeInfoDbData(v.1.2.6), Matrix(v.1.3-4), Rcpp(v.1.0.7), munsell(v.0.5.0), fansi(v.0.5.0), lifecycle(v.1.0.1), stringi(v.1.7.5), yaml(v.2.2.1), SummarizedExperiment(v.1.22.0), zlibbioc(v.1.38.0), plyr(v.1.8.6), BiocFileCache(v.2.0.0), grid(v.4.1.1), blob(v.1.2.2), crayon(v.1.4.1), lattice(v.0.20-44), Biostrings(v.2.60.2), hms(v.1.1.1), KEGGREST(v.1.32.0), locfit(v.1.5-9.4), knitr(v.1.36), pillar(v.1.6.4), rjson(v.0.2.20), codetools(v.0.2-18), biomaRt(v.2.48.3), XML(v.3.99-0.8), glue(v.1.4.2), evaluate(v.0.14), png(v.0.1-7), vctrs(v.0.3.8), gtable(v.0.3.0), purrr(v.0.3.4), reshape(v.0.8.8), assertthat(v.0.2.1), cachem(v.1.0.5), xfun(v.0.24), restfulr(v.0.0.13), tibble(v.3.1.5), GenomicAlignments(v.1.28.0), memoise(v.2.0.0) and ellipsis(v.0.3.2)